Data Visualization with Pandas and Matplotlib¶

# import library 
import pandas as pd 
import matplotlib.pyplot as plt 

# display plot in the notebook 
%matplotlib inline 

# default figure size and base font size for every plot below
plt.rcParams.update({'figure.figsize': (8, 6), 'font.size': 14})

# load the drinks data, replacing the file's header row with shorter column names
drink_cols = ["country", "beer", "spirit", "wine", "liters", "continent"]
drinks = pd.read_csv(
    "../data/drinks.csv",
    header=0,
    names=drink_cols,
    na_filter=False,  # no NA detection, so the continent code "NA" stays a plain string
)

Data Exploration¶

# preview the first five rows
drinks.head(n=5)

	country	beer	spirit	wine	liters	continent
0	Afghanistan	0	0	0	0.0	AS
1	Albania	89	132	54	4.9	EU
2	Algeria	25	0	14	0.7	AF
3	Andorra	245	138	312	12.4	EU
4	Angola	217	57	45	5.9	AF

# number of observations (rows) and columns, as a (rows, columns) tuple
drinks.shape

(193, 6)

# data structure: column names, non-null counts, dtypes and memory usage
drinks.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    193 non-null    object 
 1   beer       193 non-null    int64  
 2   spirit     193 non-null    int64  
 3   wine       193 non-null    int64  
 4   liters     193 non-null    float64
 5   continent  193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB

# numerical summary (count, mean, std, min, quartiles, max) of the numerical columns
drinks.describe() 

	beer	spirit	wine	liters
count	193.000000	193.000000	193.000000	193.000000
mean	106.160622	80.994819	49.450777	4.717098
std	101.143103	88.284312	79.697598	3.773298
min	0.000000	0.000000	0.000000	0.000000
25%	20.000000	4.000000	1.000000	1.300000
50%	76.000000	56.000000	8.000000	4.200000
75%	188.000000	128.000000	59.000000	7.200000
max	376.000000	438.000000	370.000000	14.400000

Histogram: show the distribution of a numerical variable¶

# sort the beer column; mentally split the sorted values into 3 groups
drinks["beer"].sort_values().values

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   1,   1,   1,   1,   2,   3,   5,   5,   5,   5,   5,
         6,   6,   6,   6,   8,   8,   8,   9,   9,   9,   9,  12,  13,
        15,  15,  16,  16,  17,  18,  19,  19,  20,  20,  21,  21,  21,
        21,  22,  23,  25,  25,  25,  25,  26,  28,  31,  31,  31,  31,
        32,  32,  34,  36,  36,  36,  37,  42,  42,  43,  44,  45,  47,
        49,  51,  51,  52,  52,  52,  53,  56,  56,  57,  58,  60,  62,
        62,  63,  64,  69,  71,  76,  76,  77,  77,  77,  78,  79,  82,
        82,  85,  88,  89,  90,  92,  93,  93,  98,  99, 102, 105, 106,
       109, 111, 115, 120, 122, 124, 127, 128, 130, 133, 140, 142, 143,
       144, 147, 149, 149, 152, 157, 159, 162, 163, 167, 169, 171, 173,
       185, 188, 192, 193, 193, 194, 194, 196, 197, 199, 203, 206, 213,
       217, 219, 224, 224, 225, 230, 231, 233, 234, 236, 238, 240, 245,
       245, 247, 249, 251, 261, 263, 263, 270, 279, 281, 283, 284, 285,
       295, 297, 306, 313, 333, 343, 343, 346, 347, 361, 376])

# the same data as a histogram with 3 bins
drinks["beer"].plot.hist(bins=3);

../_images/MPL02-Data Visualization with Pandas and Matplotlib_10_0.png

# a finer histogram with 20 bins
drinks["beer"].plot.hist(bins=20);

../_images/MPL02-Data Visualization with Pandas and Matplotlib_11_0.png

# add a title and axis labels to the 20-bin histogram
drinks.beer.plot(kind="hist", bins=20, title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")  # spelling fixed so the label matches the title
plt.ylabel("Frequency")
# render the finished figure
plt.show()

../_images/MPL02-Data Visualization with Pandas and Matplotlib_12_0.png

# density plot (a smoothed version of the histogram), x-axis limited to 0-500
drinks["beer"].plot.density(xlim=(0, 500));

../_images/MPL02-Data Visualization with Pandas and Matplotlib_13_0.png

Scatter Plot: show the relationship between two numerical variables¶

# take the beer and wine columns and order the rows by beer
drinks.loc[:, ["beer", "wine"]].sort_values("beer").values

array([[  0,   0],
       [  0,  74],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  0,   0],
       [  1,   7],
       [  1,   1],
       [  1,   4],
       [  1,   1],
       [  2,   0],
       [  3,   1],
       [  5,   0],
       [  5,   0],
       [  5,  16],
       [  5,   1],
       [  5,   0],
       [  6,   1],
       [  6,   0],
       [  6,   1],
       [  6,   9],
       [  8,   0],
       [  8,   1],
       [  8,   1],
       [  9,   2],
       [  9,   0],
       [  9,   7],
       [  9,   0],
       [ 12,  10],
       [ 13,   0],
       [ 15,   3],
       [ 15,   1],
       [ 16,   5],
       [ 16,   0],
       [ 17,   1],
       [ 18,   0],
       [ 19,  32],
       [ 19,   2],
       [ 20,   0],
       [ 20,  31],
       [ 21,  11],
       [ 21,  11],
       [ 21,   5],
       [ 21,   1],
       [ 22,   1],
       [ 23,   0],
       [ 25,   8],
       [ 25,  14],
       [ 25,   2],
       [ 25,   7],
       [ 26,   4],
       [ 28,  21],
       [ 31, 128],
       [ 31,   6],
       [ 31,  10],
       [ 31,   1],
       [ 32,   4],
       [ 32,   1],
       [ 34,  13],
       [ 36,  19],
       [ 36,   5],
       [ 36,   1],
       [ 37,   7],
       [ 42,   2],
       [ 42,   7],
       [ 43,   0],
       [ 44,   1],
       [ 45,   0],
       [ 47,   5],
       [ 49,   8],
       [ 51,  20],
       [ 51,   7],
       [ 52,   2],
       [ 52, 149],
       [ 52,  26],
       [ 53,   2],
       [ 56, 140],
       [ 56,   1],
       [ 57,   1],
       [ 58,   2],
       [ 60,  11],
       [ 62,  18],
       [ 62, 123],
       [ 63,   9],
       [ 64,   4],
       [ 69,   2],
       [ 71,   1],
       [ 76,   8],
       [ 76,   9],
       [ 77,   8],
       [ 77,  16],
       [ 77,   1],
       [ 78,   1],
       [ 79,   8],
       [ 82,   9],
       [ 82,   0],
       [ 85, 237],
       [ 88,   0],
       [ 89,  54],
       [ 90,   2],
       [ 92, 233],
       [ 93,   5],
       [ 93,   1],
       [ 98,  18],
       [ 99,   1],
       [102,  45],
       [105,  24],
       [106,  86],
       [109,  18],
       [111,   1],
       [115, 220],
       [120,  11],
       [122,  51],
       [124,  12],
       [127, 370],
       [128,   7],
       [130, 172],
       [133, 218],
       [140,   9],
       [142,  42],
       [143,  36],
       [144,  16],
       [147,   4],
       [149, 120],
       [149,  11],
       [152, 186],
       [157,  51],
       [159,   3],
       [162,   3],
       [163,  21],
       [167,   8],
       [169, 129],
       [171,  71],
       [173,  35],
       [185, 280],
       [188,   7],
       [192, 113],
       [193,   9],
       [193, 221],
       [194, 339],
       [194,  32],
       [196, 116],
       [197,   7],
       [199,  28],
       [203, 175],
       [206,  45],
       [213,  74],
       [217,  45],
       [219, 195],
       [224,  59],
       [224, 278],
       [225,  81],
       [230, 254],
       [231,  94],
       [233,  78],
       [234, 185],
       [236, 271],
       [238,   5],
       [240, 100],
       [245, 312],
       [245,  16],
       [247,  73],
       [249,  84],
       [251, 190],
       [261, 212],
       [263,  97],
       [263,   8],
       [270, 276],
       [279, 191],
       [281,  62],
       [283, 127],
       [284, 112],
       [285,  18],
       [295, 212],
       [297, 167],
       [306,  23],
       [313, 165],
       [333,   3],
       [343,  56],
       [343,  56],
       [346, 175],
       [347,  59],
       [361, 134],
       [376,   1]])

# compare with a scatter plot of wine against beer
drinks.plot.scatter(x="beer", y="wine");

../_images/MPL02-Data Visualization with Pandas and Matplotlib_16_0.png

# semi-transparent points make overlapping observations visible
drinks.plot.scatter(x="beer", y="wine", alpha=0.3);

../_images/MPL02-Data Visualization with Pandas and Matplotlib_17_0.png

# colour each point by its spirit servings using the "Blues" colormap
drinks.plot.scatter(x="beer", y="wine", c="spirit", colormap="Blues");

../_images/MPL02-Data Visualization with Pandas and Matplotlib_18_0.png

# scatter matrix of the 3 numerical columns beer, wine and spirit
pd.plotting.scatter_matrix(drinks[['beer', 'wine', 'spirit']]); 

../_images/MPL02-Data Visualization with Pandas and Matplotlib_19_0.png

# same scatter matrix of the 3 numerical columns, drawn on a larger figure
pd.plotting.scatter_matrix(drinks[['beer', 'wine', 'spirit']], figsize=(10,8)); 

../_images/MPL02-Data Visualization with Pandas and Matplotlib_20_0.png

Bar Plot: show a numerical comparison across different categories¶

# how many countries fall in each continent
drinks["continent"].value_counts()

AF    53
EU    45
AS    44
NA    23
OC    16
SA    12
Name: continent, dtype: int64

# the same counts as a bar plot
drinks["continent"].value_counts().plot.bar();

../_images/MPL02-Data Visualization with Pandas and Matplotlib_23_0.png

# calculate the mean alcohol amounts for each continent
# numeric_only=True skips the text column `country`; pandas >= 2.0 no longer
# drops non-numeric columns silently and raises a TypeError instead
drinks.groupby('continent').mean(numeric_only=True)

	beer	spirit	wine	liters
continent
AF	61.471698	16.339623	16.264151	3.007547
AS	37.045455	60.840909	9.068182	2.170455
EU	193.777778	132.555556	142.222222	8.617778
NA	145.434783	165.739130	24.521739	5.995652
OC	89.687500	58.437500	35.625000	3.381250
SA	175.083333	114.750000	62.416667	6.308333

# side-by-side bar plots of the per-continent means
# numeric_only=True skips the text column `country` (required on pandas >= 2.0)
drinks.groupby('continent').mean(numeric_only=True).plot(kind='bar');

../_images/MPL02-Data Visualization with Pandas and Matplotlib_25_0.png

# drop the liters column so only the three serving counts are plotted
# numeric_only=True skips the text column `country` (required on pandas >= 2.0)
drinks.groupby('continent').mean(numeric_only=True).drop('liters', axis=1).plot(kind='bar');

../_images/MPL02-Data Visualization with Pandas and Matplotlib_26_0.png

# stacked bar plots of the per-continent serving means (liters excluded)
# numeric_only=True skips the text column `country` (required on pandas >= 2.0)
drinks.groupby('continent').mean(numeric_only=True).drop('liters', axis=1).plot(kind='bar', stacked=True);

../_images/MPL02-Data Visualization with Pandas and Matplotlib_27_0.png

Box Plot: show quartiles (and outliers) for one or more numerical variables¶

Five-Number Summary

min = minimum value
25% = first quartile (Q1) = median of the lower half of the data
50% = second quartile (Q2) = median of the data
75% = third quartile (Q3) = median of the upper half of the data
max = maximum value
(The five-number summary is more useful than the mean and standard deviation for describing skewed distributions.)
Interquartile Range (IQR) = Q3 - Q1

Outliers

below Q1 - 1.5 * IQR
above Q3 + 1.5 * IQR

# spirit servings in ascending order
drinks["spirit"].sort_values().values

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,   2,
         3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,   5,   5,
         6,   6,   6,   7,   9,  11,  11,  12,  13,  15,  15,  16,  16,
        18,  18,  18,  18,  19,  21,  21,  22,  22,  25,  25,  27,  29,
        31,  31,  34,  35,  35,  35,  35,  38,  39,  41,  41,  42,  42,
        44,  46,  50,  51,  55,  56,  57,  60,  61,  63,  63,  65,  67,
        68,  69,  69,  69,  71,  71,  72,  74,  75,  76,  76,  79,  81,
        84,  87,  87,  88,  97,  97,  98,  98, 100, 100, 100, 100, 101,
       104, 104, 112, 114, 114, 114, 117, 117, 118, 118, 122, 122, 124,
       126, 128, 131, 132, 133, 133, 135, 137, 138, 145, 147, 151, 152,
       154, 156, 157, 158, 160, 170, 173, 173, 176, 178, 179, 186, 189,
       192, 194, 200, 202, 205, 215, 215, 216, 221, 226, 237, 244, 246,
       252, 254, 258, 286, 293, 302, 315, 326, 326, 373, 438])

# five-number summary of spirit (min, 25%, 50%, 75%, max), plus count, mean and std
drinks["spirit"].describe()

count    193.000000
mean      80.994819
std       88.284312
min        0.000000
25%        4.000000
50%       56.000000
75%      128.000000
max      438.000000
Name: spirit, dtype: float64

# the same summary drawn as a box plot
drinks["spirit"].plot.box();

../_images/MPL02-Data Visualization with Pandas and Matplotlib_31_0.png

# box plots for several variables at once (liters left out)
drinks.drop(columns='liters').plot.box();

../_images/MPL02-Data Visualization with Pandas and Matplotlib_32_0.png

Line Plot: show the trend of a numerical variable over time¶

# load the UFO reports, convert Time to datetimes, then derive the report year
ufo = pd.read_csv("../data/ufo.csv")
ufo = ufo.assign(Time=pd.to_datetime(ufo["Time"]))
ufo = ufo.assign(Year=ufo["Time"].dt.year)

# preview the first five rows
ufo.head(n=5)

	City	Colors Reported	Shape Reported	State	Time	Year
0	Ithaca	NaN	TRIANGLE	NY	1930-06-01 22:00:00	1930
1	Willingboro	NaN	OTHER	NJ	1930-06-30 20:00:00	1930
2	Holyoke	NaN	OVAL	CO	1931-02-15 14:00:00	1931
3	Abilene	NaN	DISK	KS	1931-06-01 13:00:00	1931
4	New York Worlds Fair	NaN	LIGHT	NY	1933-04-18 19:00:00	1933

# number of observations (rows) and columns, as a (rows, columns) tuple
ufo.shape 

(80543, 6)

# data structure: column names, non-null counts, dtypes and memory usage
ufo.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80543 entries, 0 to 80542
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   City             80496 non-null  object        
 1   Colors Reported  17034 non-null  object        
 2   Shape Reported   72141 non-null  object        
 3   State            80543 non-null  object        
 4   Time             80543 non-null  datetime64[ns]
 5   Year             80543 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 3.7+ MB

# numerical summary (Year is the only numerical column)
ufo.describe()  

	Year
count	80543.000000
mean	2004.178737
std	10.602487
min	1930.000000
25%	2001.000000
50%	2007.000000
75%	2011.000000
max	2014.000000

# number of UFO reports per year, ordered chronologically by year
ufo["Year"].value_counts().sort_index()

1930       2
1931       2
1933       1
1934       1
1935       1
        ... 
2010    4154
2011    5089
2012    7263
2013    7003
2014    5382
Name: Year, Length: 82, dtype: int64

# the yearly counts as a line plot
ufo["Year"].value_counts().sort_index().plot.line();

../_images/MPL02-Data Visualization with Pandas and Matplotlib_40_0.png

# counter-example: a line plot is misleading when the categories have no logical order
drinks["continent"].value_counts().plot.line();

../_images/MPL02-Data Visualization with Pandas and Matplotlib_41_0.png

Grouped Box Plots: show one box plot for each group¶

# reminder: box plot of beer servings
drinks["beer"].plot.box();

../_images/MPL02-Data Visualization with Pandas and Matplotlib_43_0.png

# box plot of beer servings, one box per continent
drinks.boxplot(column='beer', by='continent'); 

../_images/MPL02-Data Visualization with Pandas and Matplotlib_44_0.png

# box plots of all numerical columns, grouped by continent
drinks.boxplot(by='continent'); 

../_images/MPL02-Data Visualization with Pandas and Matplotlib_45_0.png

Grouped Histograms: show one histogram for each group¶

# reminder: histogram of beer servings with 20 bins
drinks["beer"].plot.hist(bins=20);

../_images/MPL02-Data Visualization with Pandas and Matplotlib_47_0.png

# histogram of beer servings, one panel per continent
drinks.hist(column='beer', by='continent'); 

../_images/MPL02-Data Visualization with Pandas and Matplotlib_48_0.png

# share the x-axis across the per-continent histograms
drinks.hist(column='beer', by='continent', sharex=True); 

../_images/MPL02-Data Visualization with Pandas and Matplotlib_49_0.png

# share both the x-axis and the y-axis across the per-continent histograms
drinks.hist(column='beer', by='continent', sharex=True, sharey=True); 

../_images/MPL02-Data Visualization with Pandas and Matplotlib_50_0.png

# change the layout to a 2 x 3 grid of panels (x-axis still shared)
drinks.hist(column='beer', by='continent', sharex=True, layout=(2, 3));

../_images/MPL02-Data Visualization with Pandas and Matplotlib_51_0.png

Assorted Functionality¶

# saving a plot to a file
drinks.beer.plot(kind='hist', bins=20, title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")
# the output format follows the file extension: .png, .tiff, .pdf, .jpeg
# (file name left unchanged so existing references to it keep working)
plt.savefig("beer_survings.png")

../_images/MPL02-Data Visualization with Pandas and Matplotlib_53_0.png

# list the names of the plot styles available in this Matplotlib installation
plt.style.available

['Solarize_Light2',
 '_classic_test_patch',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'tableau-colorblind10']

# use plot style: ggplot (applies to the plots drawn after this call)
plt.style.use('ggplot')

# histogram of beer servings in ggplot style
drinks.beer.plot(kind="hist", title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")

Text(0, 0.5, 'Frequency')

../_images/MPL02-Data Visualization with Pandas and Matplotlib_56_1.png

# use plot style: seaborn
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases removed the old names, so use the new name whenever it is available
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# histogram of beer servings in seaborn style
drinks.beer.plot(kind="hist", title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")

Text(0, 0.5, 'Frequency')

../_images/MPL02-Data Visualization with Pandas and Matplotlib_58_1.png

# use plot style: fivethirtyeight
plt.style.use('fivethirtyeight') 

# histogram of beer servings in fivethirtyeight style
drinks.beer.plot(kind="hist", title="Histogram of Beer Servings")
plt.xlabel("Beer Servings")
plt.ylabel("Frequency")

Text(0, 0.5, 'Frequency')

../_images/MPL02-Data Visualization with Pandas and Matplotlib_60_1.png

Intro to Data Visualization with Matplotlib Introduction to Seaborn